﻿#include "spider.h"
#include <QtNetwork>
#include <QTextCodec>
#include <QUrl>
#include <QDebug>
#include <QNetworkProxy>
#include <QEventLoop>
#include <QRegExp>
#include <QFile>
#include <QTextStream>

#pragma execution_character_set("utf-8")
// Scrapes the book introductions listed at https://www.linuxprobe.com/books

static QStringList g_list = {
    "https://www.linuxprobe.com/books/page/34",
    "https://www.linuxprobe.com/books/page/35",
    "https://www.linuxprobe.com/books/page/36",

};
static int g_index = 0;

Spider::Spider(QObject *parent) : QObject(parent)
{

    all = "";
    timer.setInterval(10000);
    timer.setSingleShot(true);

    manager = new QNetworkAccessManager(this);

#ifdef PROXY
    QNetworkProxy proxy;
    proxy.setType(QNetworkProxy::HttpProxy);
    proxy.setHostName("proxy.com");
    proxy.setPort(8080);
    proxy.setUser("xxxx");
    proxy.setPassword("xxxx");
    QNetworkProxy::setApplicationProxy(proxy);

    manager->setProxy(proxy);
#endif
    m_timerWork = new QTimer(this);
    connect(m_timerWork, &QTimer::timeout, this, &Spider::timeout);

    m_timerWork->setInterval(1000);
    m_timerWork->start();

    m_pageIndex = 1;
    m_pageMax = 100;
}

Spider::~Spider()
{
}

void Spider::httpReadyRead()
{
    //    QTextCodec *codec = QTextCodec::codecForLocale();
    //    QString data = codec->toUnicode(reply->readAll());
    QString data = QString(reply->readAll());
    all = all + data;
}

void Spider::writeFile(QString path, QString data, QIODevice::OpenModeFlag flag)
{
    //    qDebug()<<"file = "<< path;
    QFile file(path);

    if(!file.open(flag | QIODevice::Text)){
        qDebug()<<"write file error.";
        return;
    }

    QTextStream stream(&file);

    if(data.isEmpty())
        stream<<all;
    else
        stream<<data;

    stream.flush();

    file.close();
}

void Spider::get(QUrl url)
{
    QNetworkRequest request(url);

    request.setRawHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36");

    reply = manager->get(request);

    QEventLoop loop;
    connect(&timer, &QTimer::timeout, &loop, &QEventLoop::quit);
    connect(reply, &QNetworkReply::finished, &loop, &QEventLoop::quit);
    connect(reply, &QIODevice::readyRead, this, &Spider::httpReadyRead);

    timer.start();
    loop.exec();

    if(timer.isActive()){
        timer.stop();
        if(reply->error() != QNetworkReply::NoError){
            qDebug()<<"error "<<reply->errorString();
        }
        else{
            QVariant var = reply->attribute(QNetworkRequest::HttpStatusCodeAttribute);
            int nStatusCode = var.toInt();
            qDebug()<<"code: "<<nStatusCode;
        }
    }
    else{
        disconnect(reply, &QNetworkReply::finished, &loop, &QEventLoop::quit);
        disconnect(reply, &QNetworkReply::readyRead, this, &Spider::httpReadyRead);
        reply->abort();
        reply->deleteLater();
        qDebug()<<"timerout";
    }
}

void Spider::handleData()
{
    QStringList list;

    if(!all.isEmpty()){
        writeFile("test.txt", all);  //test
        /*
 * <h1 class="post-title"><a href
//        <h1 class="post-title"><a href="https://www.linuxprobe.com/vc-pdf.html" title="《VC++编程技术与难点剖析》pdf版电子书免费下载">
//        <span style="color:black"> 《VC++编程技术与难点剖析》pdf版电子书免费下载 </span>
//            </a></h1>
*/
        QRegExp rx("h1 class=\"post-title\"><a href=\".*\" title=\"(.*)\">");
        rx.setMinimal(true);
        int count = 0;
        int pos = 0;

        while((pos = rx.indexIn(all, pos)) != -1){
            ++count;
            pos += rx.matchedLength();

            list.append(rx.cap(1));
        }
    }

    if(!list.empty()){
        QString data;
        foreach (QString str, list) {
            data += str + "\n";
        }
        writeFile("list.txt", data, QIODevice::Append);
    }

    all = "";
}

void Spider::timeout()
{
    qDebug()<<"download page "<<m_pageIndex;
    if(1){
        if(m_pageIndex >= m_pageMax){
            m_timerWork->stop();
            delete this;
            qDebug()<<"work finish.";
            exit(0);
        }
        if(1 == m_pageIndex){
            get(QUrl("https://www.linuxprobe.com/books"));

            handleData();
        }
        else{
            qDebug()<<QString("https://www.linuxprobe.com/books/page/%1").arg(m_pageIndex);

            get(QUrl(QString("https://www.linuxprobe.com/books/page/%1").arg(m_pageIndex)));

            handleData();
        }
        m_pageIndex++;
    }

    if(0){
        //有一些超时失败的，重新爬取
        qDebug()<<g_list[g_index];
        get(QUrl(g_list[g_index]));
        handleData();
        g_index++;

        if(g_index >= g_list.length()){
            m_timerWork->stop();
            delete this;
            qDebug()<<"work finish.";
            exit(0);
        }
    }

}
